FPoliSolutions, LLC; Asset Monitoring and Predictive Maintenance¶

EDA¶

Arnab Dey Sarkar¶

In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
In [ ]:
# Load the training data from the CSV file in the working directory.
df = pd.read_csv("training_data.csv")
In [ ]:
# Dimensions of the raw frame as (rows, columns).
df.shape
Out[ ]:
(223, 64)
In [ ]:
# Data type of every column.
df.dtypes
Out[ ]:
X01    float64
X02    float64
X03    float64
X04    float64
X05    float64
        ...   
V26    float64
V27    float64
V28    float64
V29    float64
Y        int64
Length: 64, dtype: object
In [ ]:
# Number of missing values in each column.
df.isna().sum()
Out[ ]:
X01    0
X02    0
X03    0
X04    0
X05    0
      ..
V26    0
V27    0
V28    0
V29    0
Y      0
Length: 64, dtype: int64
In [ ]:
# Number of distinct values in each column.
df.nunique()
Out[ ]:
X01    223
X02    223
X03    223
X04    223
X05    223
      ... 
V26    223
V27    223
V28    223
V29    223
Y        2
Length: 64, dtype: int64
In [ ]:
# Concise summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 64 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X01     223 non-null    float64
 1   X02     223 non-null    float64
 2   X03     223 non-null    float64
 3   X04     223 non-null    float64
 4   X05     223 non-null    float64
 5   X06     223 non-null    float64
 6   X07     223 non-null    float64
 7   X08     223 non-null    float64
 8   X09     223 non-null    float64
 9   X10     223 non-null    float64
 10  X11     223 non-null    float64
 11  X12     223 non-null    float64
 12  X13     223 non-null    float64
 13  X14     223 non-null    float64
 14  X15     223 non-null    float64
 15  X16     223 non-null    float64
 16  X17     223 non-null    float64
 17  X18     223 non-null    float64
 18  X19     223 non-null    float64
 19  X20     223 non-null    float64
 20  X21     223 non-null    float64
 21  X22     223 non-null    float64
 22  X23     223 non-null    float64
 23  X24     223 non-null    float64
 24  X25     223 non-null    float64
 25  Z01     223 non-null    float64
 26  Z02     223 non-null    float64
 27  Z03     223 non-null    float64
 28  Z04     223 non-null    float64
 29  Z05     223 non-null    float64
 30  Z06     223 non-null    float64
 31  Z07     223 non-null    float64
 32  Z08     223 non-null    float64
 33  Z09     223 non-null    float64
 34  V01     223 non-null    float64
 35  V02     223 non-null    float64
 36  V03     223 non-null    float64
 37  V04     223 non-null    float64
 38  V05     223 non-null    float64
 39  V06     223 non-null    float64
 40  V07     223 non-null    float64
 41  V08     223 non-null    float64
 42  V09     223 non-null    float64
 43  V10     223 non-null    float64
 44  V11     223 non-null    float64
 45  V12     223 non-null    float64
 46  V13     223 non-null    float64
 47  V14     223 non-null    float64
 48  V15     223 non-null    float64
 49  V16     223 non-null    float64
 50  V17     223 non-null    float64
 51  V18     223 non-null    float64
 52  V19     223 non-null    float64
 53  V20     223 non-null    float64
 54  V21     223 non-null    float64
 55  V22     223 non-null    float64
 56  V23     223 non-null    float64
 57  V24     223 non-null    float64
 58  V25     223 non-null    float64
 59  V26     223 non-null    float64
 60  V27     223 non-null    float64
 61  V28     223 non-null    float64
 62  V29     223 non-null    float64
 63  Y       223 non-null    int64  
dtypes: float64(63), int64(1)
memory usage: 111.6 KB

I will treat the last column, Y, as categorical because it has only two unique values, and keep all the other columns numeric.

In [ ]:
# Class balance of the binary response Y.
df['Y'].value_counts()
Out[ ]:
1    138
0     85
Name: Y, dtype: int64
In [ ]:
# Summary statistics for every column; include='all' asks pandas to summarise
# columns of any dtype rather than only the numeric ones.
df.describe(include='all')
Out[ ]:
X01 X02 X03 X04 X05 X06 X07 X08 X09 X10 ... V21 V22 V23 V24 V25 V26 V27 V28 V29 Y
count 2.230000e+02 2.230000e+02 2.230000e+02 2.230000e+02 2.230000e+02 2.230000e+02 2.230000e+02 2.230000e+02 2.230000e+02 2.230000e+02 ... 223.000000 223.000000 223.000000 223.000000 223.000000 223.000000 223.000000 223.000000 223.000000 223.000000
mean -6.372581e-17 -1.035544e-16 -1.194859e-16 6.521938e-17 -6.970010e-17 2.588861e-17 4.381149e-17 5.974294e-18 -2.190575e-17 -2.788004e-17 ... 0.022254 0.118454 0.080438 0.113141 0.216649 0.124892 0.949645 0.134703 -86.306579 0.618834
std 1.000479e+01 3.432796e+00 1.876591e+00 1.184369e+00 9.268016e-01 7.579259e-01 7.087896e-01 5.524554e-01 5.110352e-01 4.389306e-01 ... 1.511050 1.662396 1.719626 1.804158 2.106987 1.783030 0.037178 0.044716 46.306093 0.486766
min -2.877510e+01 -1.048100e+01 -5.941940e+00 -3.445607e+00 -3.593157e+00 -2.791616e+00 -2.043792e+00 -1.363138e+00 -1.566863e+00 -1.539497e+00 ... -3.778512 -5.687229 -6.953736 -6.790080 -10.541583 -5.783808 0.788204 0.077250 -182.745288 0.000000
25% -6.004972e+00 -2.137978e+00 -1.174047e+00 -6.187325e-01 -5.947222e-01 -5.259962e-01 -5.143738e-01 -3.909297e-01 -3.147642e-01 -2.929840e-01 ... -1.020518 -0.831423 -0.895099 -0.995807 -0.965790 -0.970357 0.932600 0.108477 -115.559745 0.000000
50% 1.597946e-01 2.169751e-01 8.468241e-02 3.652756e-02 -2.190519e-02 1.121122e-02 -3.799648e-02 -6.275792e-02 1.811046e-02 -3.987984e-02 ... -0.122100 0.111196 0.016036 0.025615 0.190205 0.032284 0.960524 0.127374 -91.517609 1.000000
75% 6.971508e+00 2.203972e+00 1.223745e+00 7.994909e-01 5.912108e-01 5.509297e-01 4.980837e-01 3.620381e-01 2.975528e-01 2.852263e-01 ... 1.002306 1.086965 1.133833 1.174133 1.465043 1.268731 0.976685 0.146303 -63.574713 1.000000
max 2.482622e+01 1.193088e+01 4.853514e+00 3.064266e+00 2.411752e+00 2.383175e+00 1.918046e+00 1.886586e+00 1.739986e+00 1.401598e+00 ... 5.958354 6.026561 6.528996 5.999120 6.558603 6.679002 0.993385 0.467255 162.318266 1.000000

8 rows × 64 columns

In [ ]:
# Long-format copy of the data: the row index becomes an explicit `rowid`
# column and every feature is stacked into (variable, value) pairs, with
# `rowid` and `Y` kept as identifiers.
lf = (
    df
    .reset_index()
    .rename(columns={'index': 'rowid'})
    .melt(id_vars=['rowid', 'Y'])
    .copy()
)
In [ ]:
# Structure of the long-format frame (row count, columns, dtypes).
lf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14049 entries, 0 to 14048
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   rowid     14049 non-null  int64  
 1   Y         14049 non-null  int64  
 2   variable  14049 non-null  object 
 3   value     14049 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 439.2+ KB
In [ ]:
# Display the wide-format frame (pandas truncates the rendering to the first
# and last rows/columns).
df
Out[ ]:
X01 X02 X03 X04 X05 X06 X07 X08 X09 X10 ... V21 V22 V23 V24 V25 V26 V27 V28 V29 Y
0 -2.907070 1.266914 -0.332039 -0.248782 0.200432 -0.008683 0.316866 -0.323924 0.030199 -0.205569 ... 0.552065 0.489846 1.113175 -0.240931 -0.108875 -0.114766 0.841632 0.104236 -121.810994 1
1 -4.608052 4.672474 0.154697 0.268719 -0.842417 0.055191 0.622848 -0.260097 -0.651079 1.096821 ... 1.989505 1.355984 1.656029 2.428749 1.068637 1.945175 0.950544 0.143290 -59.362086 1
2 4.338816 5.684974 1.868370 -1.883006 0.589758 0.932240 -0.646026 0.183410 0.132287 -0.426386 ... 2.128248 2.553980 2.661607 2.625942 4.462401 3.621299 0.992381 0.127803 -79.575912 1
3 -1.835062 0.427501 -2.226023 0.700375 -1.144850 1.188100 0.727831 -0.271734 0.003246 0.138308 ... -0.856860 -0.766993 -0.882442 -0.832196 -0.377106 -0.633452 0.964183 0.088978 -139.426151 0
4 13.990969 -3.877269 1.921605 0.162288 2.316402 -0.161137 -0.099180 0.514620 -0.551956 -0.517779 ... -0.333291 0.126656 -0.557930 -0.185135 -0.639549 -0.245234 0.845817 0.172305 -29.433234 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
218 -3.454719 -0.911378 1.072785 0.845129 1.817682 -0.034388 -0.394277 -0.260803 0.018067 -0.718457 ... -1.041533 -0.585809 -0.831599 0.315496 -2.098443 -0.767479 0.941532 0.112127 -100.512718 0
219 0.493271 2.184699 0.107755 -1.852857 0.329977 0.679100 0.933463 0.110013 0.234102 -0.080677 ... 1.599908 1.485953 1.326105 0.738920 1.782694 1.221559 0.965555 0.098206 -118.299085 1
220 -5.009510 -0.488819 0.838883 0.615571 0.069954 -0.365945 -0.290072 0.777958 -0.392741 0.126245 ... -0.769891 -0.384573 -0.894474 -1.071848 -0.755456 -1.001084 0.898706 0.106729 -102.237684 1
221 -2.665672 -0.546497 -0.545406 -0.477273 1.476238 -0.019403 0.280312 0.359992 -0.136810 0.116397 ... -0.092393 -0.962929 -0.580874 -0.750556 -1.618888 -1.083649 0.964139 0.096673 -119.015441 1
222 0.055307 -1.709225 -0.763259 -0.755982 -0.106586 -1.174575 -0.608212 -0.483285 -0.055759 -0.658711 ... 0.076693 0.434874 0.371620 0.947893 0.775466 0.472459 0.914727 0.137963 -78.872248 1

223 rows × 64 columns

In [ ]:
# Display the long-format frame for comparison with the wide one.
lf
Out[ ]:
rowid Y variable value
0 0 1 X01 -2.907070
1 1 1 X01 -4.608052
2 2 1 X01 4.338816
3 3 0 X01 -1.835062
4 4 1 X01 13.990969
... ... ... ... ...
14044 218 0 V29 -100.512718
14045 219 1 V29 -118.299085
14046 220 1 V29 -102.237684
14047 221 1 V29 -119.015441
14048 222 1 V29 -78.872248

14049 rows × 4 columns

In [ ]:
# Number of long-format rows contributed by each original feature column.
lf['variable'].value_counts()
Out[ ]:
X01    223
V14    223
V01    223
V02    223
V03    223
      ... 
Z02    223
Z03    223
Z04    223
Z05    223
V29    223
Name: variable, Length: 63, dtype: int64
In [ ]:
# Marginal histogram of every feature, one facet per variable. Bins and
# normalisation are computed per facet and the x/y axes are not shared.
sns.displot(
    data=lf,
    x='value',
    col='variable',
    col_wrap=8,
    kind='hist',
    common_bins=False,
    common_norm=False,
    facet_kws={'sharex': False, 'sharey': False},
)

plt.show()

Several variables are clearly symmetric, e.g. X01, X02, X09, X12, Z02, Z04, V02, V04, V06 and V07.

Others are not: X10, X19 and X22 are bimodal, Z07 is left-skewed, and Z08 is right-skewed.

In [ ]:
# Kernel density estimate of every feature split by Y, one facet per variable.
# Each density is normalised on its own (common_norm=False) and the facets do
# not share axes.
sns.displot(
    data=lf,
    x='value',
    hue='Y',
    col='variable',
    col_wrap=8,
    kind='kde',
    common_norm=False,
    facet_kws={'sharex': False, 'sharey': False},
)

plt.show()

The KDE plots are usually similar across the two categories, but not always: for X19, for example, Y=1 is bimodal while Y=0 is unimodal. For the V and Z variables, the Y=1 density is concentrated to the right of the Y=0 density.

In [ ]:
# Bar chart of the number of observations in each Y class.
sns.catplot(
    data=df,
    x='Y',
    kind='count',
)

plt.show()
In [ ]:
# Small subset for detailed plots: the first two columns of each of the
# X, Z and V families, plus the response Y.
df_part_features = df[['X01', 'X02', 'Z01', 'Z02', 'V01', 'V02', 'Y']].copy()

# Long format of the subset, with `rowid` and `Y` kept as identifiers.
# (A stray `df_part_features.melt(ignore_index=False)` call was removed here:
# its result was neither stored nor displayed.)
lf_part = (
    df_part_features
    .reset_index()
    .rename(columns={'index': 'rowid'})
    .melt(id_vars=['rowid', 'Y'])
)
In [ ]:
# Display the long-format subset.
lf_part
Out[ ]:
rowid Y variable value
0 0 1 X01 -2.907070
1 1 1 X01 -4.608052
2 2 1 X01 4.338816
3 3 0 X01 -1.835062
4 4 1 X01 13.990969
... ... ... ... ...
1333 218 0 V02 0.046036
1334 219 1 V02 0.168402
1335 220 1 V02 -0.041844
1336 221 1 V02 -0.208668
1337 222 1 V02 -0.080534

1338 rows × 4 columns

In [ ]:
# Independent working copy with any row containing a missing value removed.
df_clean = df.dropna(axis=0, how='any').copy()
In [ ]:
# Treat the binary response as categorical; all other columns keep their dtype.
df_clean = df_clean.astype({'Y': 'category'})
In [ ]:
# Confirm that Y is now a category column and the features are still float64.
df_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 64 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   X01     223 non-null    float64 
 1   X02     223 non-null    float64 
 2   X03     223 non-null    float64 
 3   X04     223 non-null    float64 
 4   X05     223 non-null    float64 
 5   X06     223 non-null    float64 
 6   X07     223 non-null    float64 
 7   X08     223 non-null    float64 
 8   X09     223 non-null    float64 
 9   X10     223 non-null    float64 
 10  X11     223 non-null    float64 
 11  X12     223 non-null    float64 
 12  X13     223 non-null    float64 
 13  X14     223 non-null    float64 
 14  X15     223 non-null    float64 
 15  X16     223 non-null    float64 
 16  X17     223 non-null    float64 
 17  X18     223 non-null    float64 
 18  X19     223 non-null    float64 
 19  X20     223 non-null    float64 
 20  X21     223 non-null    float64 
 21  X22     223 non-null    float64 
 22  X23     223 non-null    float64 
 23  X24     223 non-null    float64 
 24  X25     223 non-null    float64 
 25  Z01     223 non-null    float64 
 26  Z02     223 non-null    float64 
 27  Z03     223 non-null    float64 
 28  Z04     223 non-null    float64 
 29  Z05     223 non-null    float64 
 30  Z06     223 non-null    float64 
 31  Z07     223 non-null    float64 
 32  Z08     223 non-null    float64 
 33  Z09     223 non-null    float64 
 34  V01     223 non-null    float64 
 35  V02     223 non-null    float64 
 36  V03     223 non-null    float64 
 37  V04     223 non-null    float64 
 38  V05     223 non-null    float64 
 39  V06     223 non-null    float64 
 40  V07     223 non-null    float64 
 41  V08     223 non-null    float64 
 42  V09     223 non-null    float64 
 43  V10     223 non-null    float64 
 44  V11     223 non-null    float64 
 45  V12     223 non-null    float64 
 46  V13     223 non-null    float64 
 47  V14     223 non-null    float64 
 48  V15     223 non-null    float64 
 49  V16     223 non-null    float64 
 50  V17     223 non-null    float64 
 51  V18     223 non-null    float64 
 52  V19     223 non-null    float64 
 53  V20     223 non-null    float64 
 54  V21     223 non-null    float64 
 55  V22     223 non-null    float64 
 56  V23     223 non-null    float64 
 57  V24     223 non-null    float64 
 58  V25     223 non-null    float64 
 59  V26     223 non-null    float64 
 60  V27     223 non-null    float64 
 61  V28     223 non-null    float64 
 62  V29     223 non-null    float64 
 63  Y       223 non-null    category
dtypes: category(1), float64(63)
memory usage: 110.2 KB
In [ ]:
# Correlation heatmap over all numeric columns (numeric_only=True leaves out
# the categorical Y). The colour scale is pinned to [-1, 1] and centred on 0.
fig, ax = plt.subplots(figsize=(16, 16))

sns.heatmap(
    data=df_clean.corr(numeric_only=True),
    ax=ax,
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    cbar=False,
)

plt.show()
In [ ]:
# Distinct Y labels as a plain Python list, in order of first appearance.
groups = df_clean['Y'].unique().tolist()
groups
Out[ ]:
[1, 0]
In [ ]:
# Feature-by-feature correlation matrix computed separately within each Y class.
# Y is categorical, so `observed=True` is passed explicitly: only categories
# that actually occur are grouped, and newer pandas versions no longer warn
# about the changing default for categorical group keys.
corr_per_group = df_clean.groupby('Y', observed=True).corr()
In [ ]:
# Display the stacked per-class correlation matrices (outer index level is Y).
corr_per_group
Out[ ]:
X01 X02 X03 X04 X05 X06 X07 X08 X09 X10 ... V20 V21 V22 V23 V24 V25 V26 V27 V28 V29
Y
0 X01 1.000000 -0.205255 -0.087020 0.181176 -0.106155 0.022670 0.203954 -0.116990 -0.117701 -0.052425 ... 0.489908 0.472819 0.402452 0.469191 0.434905 0.385855 0.496872 -0.214225 -0.457191 -0.407074
X02 -0.205255 1.000000 -0.000509 0.068710 0.028603 0.046520 -0.107031 0.021405 -0.192939 -0.014649 ... 0.624683 0.539181 0.587521 0.554541 0.491096 0.469262 0.493617 -0.339179 -0.212051 -0.202974
X03 -0.087020 -0.000509 1.000000 0.101524 0.037781 -0.004955 -0.129379 -0.099406 0.025789 -0.081244 ... -0.003276 0.094116 0.134313 0.178055 0.048545 0.200277 0.143240 -0.088684 -0.077255 -0.133750
X04 0.181176 0.068710 0.101524 1.000000 -0.021093 0.114410 0.116193 0.093050 0.066907 0.004890 ... -0.064748 -0.138889 -0.228806 -0.112803 -0.163159 -0.196220 -0.180789 -0.024765 -0.147766 -0.108528
X05 -0.106155 0.028603 0.037781 -0.021093 1.000000 0.011185 -0.165383 0.146792 0.070581 -0.049404 ... -0.045776 -0.078500 -0.180840 -0.218230 -0.173990 -0.193015 -0.213615 0.048804 0.070151 0.103119
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1 V25 0.449221 0.535932 0.207738 -0.319819 -0.171985 -0.109611 -0.093405 -0.094394 0.120891 -0.064388 ... 0.728426 0.784733 0.757744 0.860391 0.773741 1.000000 0.887672 0.301572 0.384675 0.362597
V26 0.509917 0.642393 0.209614 -0.329812 -0.177260 -0.125603 -0.028217 -0.056534 0.093481 -0.102301 ... 0.857264 0.900553 0.880138 0.942585 0.946392 0.887672 1.000000 0.298503 0.402619 0.366128
V27 0.020993 0.255429 0.191936 -0.078613 -0.123966 -0.051715 0.009738 0.008055 -0.026474 0.057121 ... 0.172014 0.282729 0.233793 0.286658 0.300219 0.301572 0.298503 1.000000 -0.156757 -0.175995
V28 0.481938 0.099421 -0.056146 -0.048289 -0.303781 0.006422 -0.132591 0.104939 0.089362 -0.041137 ... 0.359102 0.299891 0.337032 0.430226 0.338856 0.384675 0.402619 -0.156757 1.000000 0.961420
V29 0.487228 0.037701 -0.036594 -0.055237 -0.241634 0.026896 -0.116197 0.061452 0.045616 -0.030766 ... 0.318734 0.264227 0.306602 0.374609 0.306797 0.362597 0.366128 -0.175995 0.961420 1.000000

126 rows × 63 columns

Let's now study the correlations within each group of Y:

In [ ]:
# One correlation heatmap per Y class, stacked vertically with shared axes.
fig, axs = plt.subplots(len(groups), 1, figsize=(18, 18), sharex=True, sharey=True)

for panel, label in zip(axs, groups):
    # corr_per_group.loc[label] selects the correlation matrix of one class.
    sns.heatmap(
        data=corr_per_group.loc[label],
        ax=panel,
        cmap='coolwarm',
        center=0,
        vmin=-1,
        vmax=1,
        cbar=False,
    )
    panel.set_title('Y: %s' % label)

plt.show()

It is impossible to make detailed observations at this scale, but we can see strong positive associations between consecutive V variables, and negative associations between V27, V28, V29 and most of the other V variables.

In [ ]:
# Six-feature subset without the response, for an unconditional pair plot.
df_part_features1 = df.loc[:, ['X01', 'X02', 'Z01', 'Z02', 'V01', 'V02']].copy()

sns.pairplot(
    data=df_part_features1,
    diag_kws={'common_norm': False},
)

plt.show()
In [ ]:
# Annotated correlation heatmap of the six-feature subset; each cell shows the
# coefficient to three decimals.
fig, ax = plt.subplots()

sns.heatmap(
    data=df_part_features1.corr(numeric_only=True),
    ax=ax,
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.3f',
    annot_kws={'size': 10},
)

plt.show()

There is a good linear relationship within the pairs (X01, Z01), (X01, V01) and (Z01, V01), and almost no relationship between X01 and V02.

In [ ]:
# Pair plot of the subset coloured by Y; the diagonal plots are normalised
# separately for each class (common_norm=False).
sns.pairplot(
    data=df_part_features,
    hue='Y',
    diag_kws={'common_norm': False},
)

plt.show()

The Y=1 observations show more variability than the Y=0 observations in the relationships between pairs of continuous variables. It is also difficult to separate the two categories of Y using these features alone. This is not surprising, because the full data set has 63 feature variables and only a few are shown here. We will come back to this problem later, when we carry out the PCA.

In [ ]:
# Dimensions of the cleaned frame as (rows, columns).
df_clean.shape
Out[ ]:
(223, 64)
In [ ]:
# Violin plots of six selected features against Y on a 2x3 grid.
fig, axs = plt.subplots(2, 3, figsize=(12, 6))

# axs.flat walks the grid row by row, so the top row holds X01, X02, Z01 and
# the bottom row holds Z02, V01, V02.
for panel, feature in zip(axs.flat, ['X01', 'X02', 'Z01', 'Z02', 'V01', 'V02']):
    sns.violinplot(data=df_clean, x='Y', y=feature, inner='quartile', ax=panel)


plt.show()
In [ ]:
# Long-format view of six selected features, built directly from the full
# frame with `rowid` and `Y` kept as identifiers.
lf_part = (
    df
    .reset_index()
    .rename(columns={'index': 'rowid'})
    .melt(
        id_vars=['rowid', 'Y'],
        value_vars=['X01', 'X02', 'Z01', 'Z02', 'V01', 'V02'],
    )
    .copy()
)
In [ ]:
# Column names of the long-format subset.
lf_part.columns
Out[ ]:
Index(['rowid', 'Y', 'variable', 'value'], dtype='object')
In [ ]:
# Box plot of each selected feature by Y class, one facet per feature, with an
# independent y-axis per facet.
sns.catplot(
    data=lf_part,
    x='Y',
    y='value',
    col='variable',
    col_wrap=3,
    kind='box',
    sharey=False,
)

plt.show()
In [ ]:
# Per-class correlation matrices restricted to six selected features.
# Y is categorical, so `observed=True` is passed explicitly: only categories
# that actually occur are grouped, and newer pandas versions no longer warn
# about the changing default for categorical group keys.
corr_per_group1 = (
    df_clean.loc[:, ['X01', 'X02', 'Z01', 'Z02', 'V01', 'V02', 'Y']]
    .groupby('Y', observed=True)
    .corr()
)
In [ ]:
# Side-by-side correlation heatmaps of the selected features, one per Y class.
fig, axs = plt.subplots(1, len(groups), figsize=(18, 6), sharex=True, sharey=True)

for panel, label in zip(axs, groups):
    # corr_per_group1.loc[label] selects the correlation matrix of one class.
    sns.heatmap(
        data=corr_per_group1.loc[label],
        ax=panel,
        cmap='coolwarm',
        center=0,
        vmin=-1,
        vmax=1,
        cbar=False,
    )
    panel.set_title('Y: %s' % label)

plt.show()
In [ ]: